import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.InputStreamReader;
import java.io.Writer;
import java.net.URL;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Crawls crime news articles from a nola.com index page and post-processes
 * the saved articles by stripping HTML markup from them.
 *
 * <p>Articles are stored as text files under {@code folderName}; the
 * markup-free versions are written under {@code processFileDirectory}.
 * All file access goes through the project's {@code FileIO} helper.
 */
public class NOLACrimeNewsCrawl {

	/** Matches one headline link ({@code <h2 class="entry-title"><a href=...>...</a></h2>}) on an index page. */
	private static final Pattern ENTRY_LINK_PATTERN = Pattern.compile("<h2\\s*class=\"entry-title\"><a\\s*href\\s*=\\s*\"?(.*?)[\"|>]?(.*?)</a></h2>", Pattern.CASE_INSENSITIVE);

	/** Extracts the quoted value of an {@code href} attribute. */
	private static final Pattern HREF_PATTERN = Pattern.compile("href\\s*=\\s*\"([^\"]*)\"", Pattern.CASE_INSENSITIVE);

	/**
	 * Matches the article body, from the "entry-content" div up to the last
	 * "clear" div. DOTALL lets {@code .} cross line breaks without the nested
	 * quantifier {@code (\s*|.*)+}, which can overflow the stack on large pages.
	 */
	private static final Pattern BODY_PATTERN = Pattern.compile("<div\\s*class=\"entry-content\">.*<div\\s*class=\"clear\">", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

	/** Matches a {@code <title>} element that sits on a single line. */
	private static final Pattern TITLE_PATTERN = Pattern.compile("<title>.*</title>");

	/** Matches an HTML tag, including tags that span several lines. */
	private static final Pattern TAG_PATTERN = Pattern.compile("<[^>]*>");

	/** Matches a named or numeric character entity such as {@code &amp;} or {@code &#8217;}. */
	private static final Pattern ENTITY_PATTERN = Pattern.compile("&#?[A-Za-z0-9]+;");

	/** Characters that must not appear in a file name on common file systems. */
	private static final Pattern UNSAFE_FILE_CHARS = Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]");

	/**
	 * Number given to the first article saved by a crawl.
	 * NOTE(review): presumably continues the numbering of an earlier crawl - confirm.
	 */
	private static final int FIRST_ARTICLE_NUMBER = 135;

	private final String folderName = "/media/Utils/UTA/CSE6339 Advance DB/Project/News Articles/NOLA Crime News/";
	private final String processFileDirectory="/media/Utils/UTA/CSE6339 Advance DB/Project/News Articles/Processed NOLA Crime News/";
	// Alternative locations used on a Windows workstation:
	//private String folderName ="C:\\Users\\lakshman\\workspace\\cse6339-faceted-interface\\News Articles\\NOLA Crime News\\";
	//private String processFileDirectory="C:\\Users\\lakshman\\workspace\\cse6339-faceted-interface\\Processed Articles\\Processed NOLA Crime Articles";

	private final FileIO file = new FileIO();

	/**
	 * Entry point: post-processes the already downloaded articles.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		NOLACrimeNewsCrawl news = new NOLACrimeNewsCrawl();
		//news.crawlNOLAArticles();
		news.processArticles();
	}

	/**
	 * Strips the HTML markup from every regular file in {@code folderName} and
	 * writes the result to a file of the same name under
	 * {@code processFileDirectory}. Prints a message and returns when the
	 * source folder is missing or empty.
	 */
	public void processArticles(){
		File root = new File(folderName);
		if (!root.isDirectory()) {
			System.out.println("Root doesn't exist or it's not a directory at all");
			return;
		}
		File[] files = root.listFiles();
		if (files == null || files.length == 0) {
			System.out.println("Root directory is empty, no articles found");
			return;
		}
		// Every file is processed; the loop previously stopped after the first one.
		for (File f : files) {
			if (!f.isFile()) {
				continue; // sub-directories hold no article text
			}
			StringBuffer contents = file.readFromFile(f);
			String outFilePath = file.createProcessFile(processFileDirectory+"/"+f.getName());
			if (contents != null) {
				String content = removeHTMLTags(contents);
				// removeHTMLTags yields null for an empty article; there is nothing to write then.
				if (content != null) {
					file.writeToFile(content, outFilePath);
				}
			}
		}
	}

	/**
	 * Removes HTML tags and character entities from the given text. The text
	 * is echoed to standard output before and after the clean-up.
	 *
	 * @param contents raw article text; may be {@code null}
	 * @return the text without tags and entities, or {@code null} when
	 *         {@code contents} is {@code null} or empty
	 */
	public String removeHTMLTags(StringBuffer contents){
		if (contents == null || contents.length() == 0) {
			return null;
		}
		String value = contents.toString();
		System.out.println("Original Content: \n"+value);
		String withoutTags = TAG_PATTERN.matcher(value).replaceAll("");
		// Only real entities are removed, so a bare '&' followed later by ';' keeps its text.
		String newValue = ENTITY_PATTERN.matcher(withoutTags).replaceAll("");
		System.out.println("New Content: \n"+newValue);
		return newValue;
	}

	/**
	 * Downloads the NOLA crime index page, follows every headline link on it
	 * and saves each article that has a body as
	 * {@code <title>_<number>.txt} under {@code folderName}.
	 */
	private void crawlNOLAArticles()
	{
		// NOLA (Louisiana) crime archive index page
		String url1 = "http://www.nola.com/crime/index.ssf/2012/03/index_10.html";
		ArrayList<String> urls = new ArrayList<String>();
		urls.add(url1);
		for (String url : urls)
		{
			String retrievedText = downloadPage(url);
			if (retrievedText.isEmpty()) {
				continue;
			}
			ArrayList<String> links = retrieveNOLALinks(retrievedText);
			if (links.isEmpty()) {
				System.out.println("ERROR: No links found in page: " + url);
				continue;
			}
			int articleCnt = FIRST_ARTICLE_NUMBER;
			for (String link : links)
			{
				System.out.println(articleCnt+" "+link);
				String newsContent = downloadPage(link);
				String title = downloadArticleTitle(newsContent);
				String content = downloadArticleBody(newsContent);

				// downloadArticleBody returns "" when the page has no body, so this check is effective.
				if (!content.isEmpty()) {
					String article = link+" \n "+title +" \n "+content;
					String filename = folderName + toSafeFileName(title) +"_"+articleCnt + ".txt";
					file.writeToFile(article, filename);
					articleCnt++;
				}
			}
		}
	}

	/** Replaces characters that are illegal in file names with spaces. */
	private String toSafeFileName(String title)
	{
		return UNSAFE_FILE_CHARS.matcher(title).replaceAll(" ").trim();
	}

	/**
	 * Collects the article URLs of all headline links found in an index page.
	 *
	 * @param fullText HTML of the index page
	 * @return the link targets in page order; empty when none are found
	 */
	private ArrayList<String> retrieveNOLALinks(String fullText)
	{
		ArrayList<String> links = new ArrayList<String>();
		if (fullText == null || fullText.isEmpty()) {
			System.out.println("No text in the webpage.");
			return links;
		}
		Matcher m = ENTRY_LINK_PATTERN.matcher(fullText);
		while (m.find())
		{
			// Read the quoted href value directly instead of slicing by index, which
			// failed for unquoted or spaced attributes and for titles containing "\">".
			Matcher href = HREF_PATTERN.matcher(m.group());
			if (href.find() && !href.group(1).isEmpty()) {
				links.add(href.group(1));
			}
		}
		return links;
	}

	/**
	 * Extracts the article body from a downloaded article page and echoes it
	 * to standard output.
	 *
	 * @param content HTML of the article page
	 * @return the body markup prefixed with {@code " LA: "}, or an empty
	 *         string when the page contains no body
	 */
	private String downloadArticleBody(String content)
	{
		if (content == null || content.isEmpty()) {
			return "";
		}
		StringBuilder fullBody = new StringBuilder();
		Matcher m = BODY_PATTERN.matcher(content);
		while (m.find())
		{
			fullBody.append(m.group().trim());
			System.out.println(fullBody);
		}
		if (fullBody.length() == 0) {
			return "";
		}
		return " LA: "+fullBody;
	}

	/**
	 * Extracts the article title from a downloaded page. The part of the
	 * {@code <title>} element before its last {@code '|'} is used (the whole
	 * element text when there is no {@code '|'}), and the characters
	 * {@code ?}, {@code :} and {@code "} are replaced with spaces. Each title
	 * found is echoed to standard output; the last one wins.
	 *
	 * @param content HTML of the article page; may be {@code null}
	 * @return the trimmed title, or an empty string when none is found
	 */
	public String downloadArticleTitle(String content){
		String title = "";
		if (content == null) {
			return title;
		}
		final int start = "<title>".length();
		Matcher m = TITLE_PATTERN.matcher(content);
		while (m.find())
		{
			String data = m.group();
			int end = data.lastIndexOf('|');
			if (end < start) {
				// No separator: keep the full title instead of failing on substring(7, -1).
				end = data.lastIndexOf("</title>");
			}
			title = data.substring(start, end);
			title = title.replace('?', ' ');
			title = title.replace(':', ' ');
			title = title.replace('\"', ' ');
			System.out.println(title);
		}
		return title.trim();
	}

	/**
	 * Downloads the page at the given URL.
	 *
	 * @param url address of the page
	 * @return the page text with every line terminated by {@code "\n"};
	 *         whatever was read before a failure (possibly an empty string)
	 *         when the download fails
	 */
	private String downloadPage(String url)
	{
		StringBuilder fullText = new StringBuilder();
		BufferedReader br = null;
		try
		{
			// NOTE(review): the platform default charset is used for decoding - confirm it suits the site.
			br = new BufferedReader(new InputStreamReader(new URL(url).openStream()));
			String text;
			while ((text = br.readLine()) != null)
			{
				fullText.append(text).append("\n");
			}
		}
		catch (Exception ex)
		{
			ex.printStackTrace();
		}
		finally
		{
			// Always release the connection, even after a failed read.
			if (br != null) {
				try
				{
					br.close();
				}
				catch (Exception ignored)
				{
					// best effort: the text read so far is still returned
				}
			}
		}
		return fullText.toString();
	}

}
